Coverage Report

Created: 2026-06-19 16:17

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
D:\a\csshw\csshw\xtask\src\typography.rs
Line
Count
Source
1
//! Typography linter that blocks decorative or "smart" Unicode
2
//! punctuation from sneaking into the repository.
3
//!
4
//! Agents tend to introduce em-dashes, en-dashes, smart quotes,
5
//! ellipsis, arrows, and similar non-ASCII glyphs in comments and
6
//! prose. They look similar to their ASCII equivalents but are not
7
//! what a Windows developer types and not what `cargo fmt` produces.
8
//!
9
//! [`check_typography`] enumerates tracked text files via
10
//! `git ls-files`, scans each for a curated blocklist of code points,
11
//! prints any violations as `path:line:col U+XXXX 'glyph'`, and
12
//! returns an error when at least one violation is found so the
13
//! pre-commit hook and CI both abort.
14
//!
15
//! Performance: the scan runs inside the pre-commit hook, so the
16
//! hot path reads bytes, exits early on pure-ASCII input, and only
17
//! decodes UTF-8 for files that actually contain non-ASCII bytes.
18
19
use std::path::{Path, PathBuf};
20
21
use anyhow::{bail, Context, Result};
22
23
/// File extensions whose contents are scanned.
///
/// Entries are written in lowercase and [`should_scan`] compares them
/// ASCII case-insensitively against the text after the last `.` of the
/// path, so `README.MD` matches `md`. Paths without any `.` are
/// scanned only when they appear in [`SCAN_EXTRA_PATHS`].
const SCAN_EXTENSIONS: &[&str] = &[
    "rs", "md", "toml", "yml", "yaml", "json", "html", "txt", "cfg", "sh", "ps1", "js", "mjs",
];

/// Tracked paths without a recognised extension that should still be
/// scanned (shell scripts, hooks, etc.). Compared against the
/// `git ls-files` output verbatim (forward slashes, case-sensitive).
const SCAN_EXTRA_PATHS: &[&str] = &[".githooks/pre-commit"];

/// Tracked paths that are explicitly excluded from scanning. Used for:
///
/// - generated artefacts such as `Cargo.lock`,
/// - files (such as the `CHANGELOG.md`) that may legitimately preserve
///   historical typography from prior releases,
/// - templates and workflow snippets whose non-ASCII content is
///   intentional and rendered to users (e.g. social-preview titles,
///   GitHub Pages footers, PR-comment heredocs).
///
/// Keep this list short -- the goal is to fix offending content, not
/// to allowlist around it. Compared against the `git ls-files` output
/// verbatim (forward slashes). [`should_scan`] consults this list
/// first, so an entry here wins over [`SCAN_EXTENSIONS`] and
/// [`SCAN_EXTRA_PATHS`].
///
/// NOTE(review): `CHANGELOG.md` is named above as an example but is
/// not currently part of the list -- confirm whether that is intended.
const ALLOWED_PATHS: &[&str] = &[
    "Cargo.lock",
    ".github/workflows/news-fragment-check.yml",
    "templates/github-pages-index.html",
    "templates/social-preview.html",
];

/// Hard cap on file size accepted by the scanner (5 MiB). Anything
/// strictly larger is skipped with a warning -- the repo has nothing
/// close to this size, and a pathological large file should not block
/// a commit. A file of exactly this size is still scanned.
const MAX_FILE_BYTES: u64 = 5 * 1024 * 1024;
59
60
/// All side-effecting operations performed by the typography scanner.
///
/// [`check_typography`] obtains the tracked-file list, file sizes and
/// file contents through this trait and routes its "skipping" warnings
/// through [`TypographySystem::log`]. Implement with mocks in tests to
/// achieve zero filesystem and process side-effects.
pub trait TypographySystem {
    /// Return the list of tracked files reported by `git ls-files`.
    ///
    /// Paths are returned with forward slashes (the format `git`
    /// emits on every platform). Each entry is later matched verbatim
    /// against the allow/extra path lists, so implementations should
    /// not normalise or re-quote the paths.
    ///
    /// # Errors
    ///
    /// Returns an error if the `git` process cannot be started or
    /// exits non-zero.
    fn list_tracked_files(&self) -> Result<Vec<String>>;

    /// Return the size in bytes of the file at `path`.
    ///
    /// Called before [`TypographySystem::read_file`] so oversized
    /// files can be skipped without being read.
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be stat-ed.
    fn file_size(&self, path: &Path) -> Result<u64>;

    /// Read the full contents of the file at `path` as raw bytes.
    ///
    /// No decoding is performed here; UTF-8 validation happens in
    /// [`scan_bytes`].
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be read.
    fn read_file(&self, path: &Path) -> Result<Vec<u8>>;

    /// Emit a message to the user (informational or warning).
    ///
    /// # Arguments
    ///
    /// * `msg` - Message to display.
    fn log(&self, msg: &str);
}
97
98
/// Production implementation of [`TypographySystem`].
99
pub struct RealSystem;
100
101
#[cfg_attr(coverage_nightly, coverage(off))]
102
impl TypographySystem for RealSystem {
103
    fn list_tracked_files(&self) -> Result<Vec<String>> {
104
        let output = std::process::Command::new("git")
105
            .args(["ls-files"])
106
            .output()
107
            .context("failed to run `git ls-files`")?;
108
        if !output.status.success() {
109
            bail!(
110
                "`git ls-files` exited non-zero: {}",
111
                String::from_utf8_lossy(&output.stderr)
112
            );
113
        }
114
        let stdout =
115
            String::from_utf8(output.stdout).context("`git ls-files` produced non-UTF-8 output")?;
116
        Ok(stdout
117
            .lines()
118
            .filter(|line| !line.is_empty())
119
            .map(|line| line.to_owned())
120
            .collect())
121
    }
122
123
    fn file_size(&self, path: &Path) -> Result<u64> {
124
        let meta = std::fs::metadata(path)
125
            .with_context(|| format!("failed to stat {}", path.display()))?;
126
        Ok(meta.len())
127
    }
128
129
    fn read_file(&self, path: &Path) -> Result<Vec<u8>> {
130
        std::fs::read(path).with_context(|| format!("failed to read {}", path.display()))
131
    }
132
133
    fn log(&self, msg: &str) {
134
        eprintln!("{msg}");
135
    }
136
}
137
138
/// A single offending code point found in a scanned file.
///
/// Produced by [`scan_bytes`] and printed by [`check_typography`] as
/// `path:line:col U+XXXX 'glyph'`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Violation {
    /// Repository-relative path with forward slashes.
    pub path: String,
    /// 1-based line number of the offending character. Lines are
    /// separated by `'\n'`.
    pub line: u32,
    /// 1-based column (counted in `char`s, not bytes) of the offending
    /// character. Carriage returns are not counted.
    pub column: u32,
    /// The offending Unicode scalar value.
    pub character: char,
}
151
152
/// Report whether `c` is one of the code points rejected by the
/// typography check.
///
/// The list is curated by hand and targets the decorative glyphs that
/// LLMs habitually substitute for ASCII punctuation. Emoji and all
/// other non-ASCII characters are intentionally left alone.
///
/// # Arguments
///
/// * `c` - Character to test.
///
/// # Returns
///
/// `true` when `c` is on the blocklist, `false` otherwise.
pub fn is_blocklisted(c: char) -> bool {
    matches!(
        c,
        // No-break space, middle dot, multiplication and division signs.
        '\u{00A0}' | '\u{00B7}' | '\u{00D7}' | '\u{00F7}'
        // Fixed-width, zero-width, narrow no-break, math and ideographic spaces.
        | '\u{2000}'..='\u{200B}'
        | '\u{202F}' | '\u{205F}' | '\u{3000}'
        // Unicode hyphens, en/em dashes, horizontal bar, minus sign.
        | '\u{2010}'..='\u{2015}' | '\u{2212}'
        // Curly single and double quotation marks.
        | '\u{2018}'..='\u{201F}'
        // Bullet and horizontal ellipsis.
        | '\u{2022}' | '\u{2026}'
        // The whole Arrows block.
        | '\u{2190}'..='\u{21FF}'
        // Almost-equal, not-equal, less-or-equal, greater-or-equal.
        | '\u{2248}' | '\u{2260}' | '\u{2264}' | '\u{2265}'
    )
}
186
187
/// Decide whether `path` should be scanned.
188
///
189
/// A file is scanned when:
190
///
191
/// 1. it is not in [`ALLOWED_PATHS`], and
192
/// 2. its lowercase extension is in [`SCAN_EXTENSIONS`], or its path
193
///    appears verbatim in [`SCAN_EXTRA_PATHS`].
194
///
195
/// # Arguments
196
///
197
/// * `path` - Forward-slash relative path as emitted by
198
///   `git ls-files`.
199
///
200
/// # Returns
201
///
202
/// `true` when the file should be scanned, `false` otherwise.
203
21
pub fn should_scan(path: &str) -> bool {
204
21
    if ALLOWED_PATHS.contains(&path) {
205
5
        return false;
206
16
    }
207
16
    if SCAN_EXTRA_PATHS.contains(&path) {
208
1
        return true;
209
15
    }
210
15
    let Some(
dot14
) = path.rfind('.') else {
211
1
        return false;
212
    };
213
14
    let ext = &path[dot + 1..];
214
14
    SCAN_EXTENSIONS
215
14
        .iter()
216
73
        .
any14
(|allowed| allowed.eq_ignore_ascii_case(ext))
217
21
}
218
219
/// Scan a single file's contents and return any violations.
220
///
221
/// Pure function -- no I/O. Files that are pure ASCII return early
222
/// before allocating or decoding UTF-8, which keeps the common case
223
/// (almost every `.rs` file in this repo) cheap.
224
///
225
/// Files that are not valid UTF-8 are reported via the returned
226
/// `non_utf8` flag and produce no violations; the caller decides
227
/// whether to surface that as a warning.
228
///
229
/// # Arguments
230
///
231
/// * `path` - Display path used when constructing violations.
232
/// * `bytes` - Raw file contents.
233
///
234
/// # Returns
235
///
236
/// `(violations, non_utf8)` where `non_utf8` is `true` if the file
237
/// could not be decoded as UTF-8.
238
10
pub fn scan_bytes(path: &str, bytes: &[u8]) -> (Vec<Violation>, bool) {
239
    // Fast path: pure ASCII -> nothing to flag.
240
112
    if 
bytes.iter()10
.
all10
(|&b| b < 0x80) {
241
3
        return (Vec::new(), false);
242
7
    }
243
244
7
    let Ok(
text5
) = std::str::from_utf8(bytes) else {
245
2
        return (Vec::new(), true);
246
    };
247
248
5
    let mut violations = Vec::new();
249
5
    let mut line: u32 = 1;
250
5
    let mut column: u32 = 1;
251
64
    for c in 
text5
.
chars5
() {
252
64
        if c == '\n' {
253
5
            line += 1;
254
5
            column = 1;
255
5
            continue;
256
59
        }
257
59
        if c == '\r' {
258
            // CRLF: do not advance the column. The following '\n' resets it.
259
0
            continue;
260
59
        }
261
59
        if is_blocklisted(c) {
262
5
            violations.push(Violation {
263
5
                path: path.to_owned(),
264
5
                line,
265
5
                column,
266
5
                character: c,
267
5
            });
268
54
        }
269
59
        column += 1;
270
    }
271
5
    (violations, false)
272
10
}
273
274
/// Scan every tracked text file and report violations.
275
///
276
/// Reads the file list via `git ls-files`, filters it through
277
/// [`should_scan`], and runs [`scan_bytes`] on each remaining file.
278
/// Violations are printed to stderr as
279
/// `path:line:col U+XXXX 'glyph'`.
280
///
281
/// # Arguments
282
///
283
/// * `system` - Injected I/O provider.
284
///
285
/// # Returns
286
///
287
/// `Ok(())` when no violations are found.
288
///
289
/// # Errors
290
///
291
/// Returns an error when at least one violation is found, or when an
292
/// I/O operation fails. Files that are too large or not valid UTF-8
293
/// are skipped with a warning and do not fail the run.
294
4
pub fn check_typography<S: TypographySystem>(system: &S) -> Result<()> {
295
4
    let files = system.list_tracked_files()
?0
;
296
4
    let mut violations: Vec<Violation> = Vec::new();
297
5
    for rel in 
files4
{
298
5
        if !should_scan(&rel) {
299
1
            continue;
300
4
        }
301
4
        let path = PathBuf::from(&rel);
302
4
        let size = system.file_size(&path)
?0
;
303
4
        if size > MAX_FILE_BYTES {
304
1
            system.log(&format!(
305
1
                "WARNING - skipping {rel}: {size} bytes exceeds {MAX_FILE_BYTES} byte cap"
306
1
            ));
307
1
            continue;
308
3
        }
309
3
        let bytes = system.read_file(&path)
?0
;
310
3
        let (mut found, non_utf8) = scan_bytes(&rel, &bytes);
311
3
        if non_utf8 {
312
1
            system.log(&format!("WARNING - skipping {rel}: not valid UTF-8"));
313
1
            continue;
314
2
        }
315
2
        violations.append(&mut found);
316
    }
317
318
4
    if violations.is_empty() {
319
3
        println!("INFO - check-typography: no forbidden Unicode found.");
320
3
        return Ok(());
321
1
    }
322
323
1
    eprintln!(
324
        "ERROR - check-typography: found {} forbidden Unicode character(s).",
325
1
        violations.len()
326
    );
327
1
    eprintln!("        Replace them with their ASCII equivalents (em/en-dashes -> '-',");
328
1
    eprintln!("        smart quotes -> ' or \", ellipsis -> ..., arrows -> -> / <-, etc.).");
329
1
    eprintln!();
330
1
    for v in &violations {
331
1
        eprintln!(
332
1
            "{}:{}:{} U+{:04X} {:?}",
333
1
            v.path, v.line, v.column, v.character as u32, v.character
334
1
        );
335
1
    }
336
1
    bail!("found {} forbidden Unicode character(s)", violations.len())
337
4
}
338
339
#[cfg(test)]
340
#[path = "tests/test_typography.rs"]
341
mod tests;